Introduction

RNA-seq was run on 36 ovarian cancer cell lines, each in singlicate.

All 36 cell lines have 72h cisplatin IC50s determined by Kristin Adams and Kendra Wendt in the Lang Lab.

Add in links to Lab notebooks for IC50, RNAseq sample prep

Inputs

library(knitr)
knit("src/load_inputs.Rmd")


processing file: src/load_inputs.Rmd

  |                                                                     
  |                                                               |   0%
  |                                                                     
  |.....                                                          |   8%                      
  |                                                                     
  |..........                                                     |  17% [load packages]      
  |                                                                     
  |................                                               |  25%                      
  |                                                                     
  |.....................                                          |  33% [unnamed-chunk-1]    
  |                                                                     
  |..........................                                     |  42%                      
  |                                                                     
  |................................                               |  50% [load metatable-full]

  |                                                                     
  |.....................................                          |  58%                      
  |                                                                     
  |..........................................                     |  67% [load metatable]     

  |                                                                     
  |...............................................                |  75%                      
  |                                                                     
  |....................................................           |  83% [read count matrix]  
  |                                                                     
  |..........................................................     |  92%                      
  |                                                                     
  |...............................................................| 100% [unnamed-chunk-2]    
                                                                                                                               
output file: load_inputs.md
[1] "load_inputs.md"

Subset Metadata to exclude PEO4, PEO6, and PEA1

metadata.subset = metadata[!(metadata$CellLine %in% c("PEO4", "PEO6", "PEA1")),]
countmatrix2.subset = countmatrix2[, colnames(countmatrix2) %in% metadata.subset$files]
TPM2.subset = TPM2[, colnames(TPM2) %in% metadata.subset$CellLine]
TPM.log.subset = TPM.log[, colnames(TPM.log) %in% metadata.subset$files]

print(countmatrix2.subset)
                           X14    X22    X21    X31    X30     X17    X32     X13      X5
A1BG                        52     45     48      2      4      27     16      37      30
A1BG-AS1                   108     80     88     15      6     107     34      80      28
A1CF                         0      0      0      0      0       0      1       0       0
A2M                       1161      0     10      7     23   11639      1      11      38
A2M-AS1                     15     11     99     52      8       4     10       5       4
A2ML1                        7     47      2      0      9      20     23       2       0
A2MP1                        0      0      0      0      0       0      0       0       0
A3GALT2                      0      0      0      0      0       0      0       0       0
A4GALT                     369     39    284    153     57     154     26     143      37
A4GNT                        0      0      0      0      0       0      0       0       0
AA06                         0      0      0      0      0       0      0       0       0
AAAS                       889   1392   1587    677   1397    1023   1416    1153    1454
AACS                      2836   1984    713   1344   1113    1350   1340     686     639
AACSP1                       2      0      1      4      0       0      1       1       0
AADAC                        2      2      6      0      3       4     18       0      73
AADACL2                      0      0      0      0      1       0      5       0       0
AADACL3                      0      0      0      0      0       0      0       0       0
AADACL4                      0      0      0      0      0       6      0       0       2
AADAT                      235    296    633    118    223      74    436     187     225
AAED1                      348    321    891    554    309     441    233     658     215
AAGAB                     4120   3392   3090   1630   3544    2878   4825    3824    1508
AAK1                      3673   2172   2322   2244   2444    4453   2252    2593    3770
AAMDC                      215    216    491    237    258     511   2129     130     146
AAMP                      2497   3582   2349   1826   1898    3290   4554    4058    1826
AANAT                        2      0      0      0      1       1      5       1       2
AAR2                       565   2711   1263    769   1468    1357   1720     937    1482
AARD                       351    519    322     61     70      39    452     121      98
AARS                      4174   8162   7419   7563   7822    5340   2623    7486    5111
AARS2                      733    817    564    379    875    1129    583     851     895
AARSD1                     159    855    482    423    809     347    396     411     343
AASDH                      644    477    689    688    636     624    665     236     315
AASDHPPT                  1261   1917   2510   2203   2379    4342   1308    2985    2821
AASS                       586    174    637    206     42       4    120      19     754
AATF                      1382   5882   2831   2341   3856    1354   2917    3659    4013
AATK                        95     19     17      6     36      25     65      14       5
AATK-AS1                     0      0      0      0      0       0      0       0       0
ABAT                       107    743    126    232     92    1110    195       8     633
ABCA1                       40    287   1755    190    413    1037    345     550    1828
ABCA10                       3      5      7      3      0       8      2       1       6
ABCA11P                    191    549    209    110    193     347    287     147     328
ABCA12                      42      8     27     41      0      47     21       8       2
ABCA13                      30     75     67    142      2       0      2       0       0
ABCA17P                      0      3      1      0      1       3      5       0      23
ABCA2                      622    781    801    457    554    1231    732     690    1230
ABCA3                      546    156   1039    252    237     267    271     136     245
ABCA4                        9    142     21     57    156     417     94      46     173
ABCA5                      650    164    233    356    135     205    152      87     268
ABCA6                        0      1      1      0      0       4      0       0       0
ABCA7                      114    125    121    278     90     165    100      92     125
ABCA8                        3      6      0      1      0       6      3       0       1
ABCA9                        0      1      8      2      0      76      1       0       2
ABCB1                      170     54     41    116     29       0      1       3       8
ABCB10                    1225    629    732    832   1646     928    394     683    1609
ABCB11                       0      0      0      0      0       0     10       0       0
ABCB4                       19     70     12      9      0       0     12       0       0
ABCB5                        1     56     33      0     33     223    103       9      78
ABCB6                      620    663    133    241    208     495    676     947     147
ABCB7                     1583    975   1422   1400   2329    1379   1573    1179    1414
ABCB8                      515    574    500    205    341     857    260     343     559
ABCB9                      202    157     74     40     72     151    144     104      49
ABCC1                     1279   4911   4665   2207   2293    5168   3316    7774    2832
ABCC10                     293    242    199    134    180     492    283     332     206
ABCC11                       2      3     11      4      5       7      9       0       2
ABCC12                       0      0      0      2      0       0      0       1       0
ABCC13                       1      0      0      2      0       0     13       0       0
ABCC2                       10     23     24     47     45      29      7       6      14
ABCC3                     2445    255   1201    488    459     423     89      35      81
ABCC4                      455   1054   1383   1645    807    1645    881    2064    6141
ABCC5                     1828   1201    592    724    612    1836    497    1073    1360
ABCC5-AS1                    0      0      0      1      0       0      0       0       0
ABCC6                       50     91     30      4     27      72     20      76       7
ABCC6P1                     24     18      1      5      4      43      9       0       0
ABCC6P2                     36     34     17      0      3      12     14      19       6
ABCC8                       16      0      1      0      0       2      7       0       0
ABCC9                        1      0      3      6      0       1      1       0       0
ABCD1                      379    104    452    221    229     363    240     192     389
ABCD2                        4      0      0      0      0       0      0       0       1
ABCD3                     4013   2019   1834   2817   4969    5719   2284    4875    5167
ABCD4                      352    352    371    309    133     517   1152     480     203
ABCE1                     5269   6565   6960   4720  12020    5439   7210   10354    5768
ABCF1                     7913   4562   4849   2529   4211    6006   6770    4739    3134
ABCF2                     3929   4840   6686   2916   4970    5439   2695    5436    4722
ABCF3                     2458   1556   1146   1123   1317    1298   1457    1612    1458
ABCG1                      157     86     63      5     46     251     43     100     535
ABCG2                      263     43    223    357     47      11     12      23      26
ABCG4                       13     16      9      2     18      15     17       3      25
ABCG5                        3      5      0      0      1       0      0       0       0
ABCG8                        0      1      0      1      0       0      3       0       0
ABHD1                       22      7      9      4      5       0     15       7       4
ABHD10                    1684   1800    932    707   1198    1374   1304    1582     838
                          X24     X33
A1BG                        4       1
A1BG-AS1                   20       9
A1CF                        0       0
A2M                         0     113
A2M-AS1                    21      26
A2ML1                       0      21
A2MP1                       0       0
A3GALT2                     0       0
A4GALT                    349      82
A4GNT                       0       0
AA06                        0       0
AAAS                      665    1172
AACS                     1051    1081
AACSP1                      0       2
AADAC                     148       3
AADACL2                     0       0
AADACL3                     0       0
AADACL4                     0       0
AADAT                      30     211
AAED1                     361     303
AAGAB                    3164    2449
AAK1                     1415    2508
AAMDC                      73     267
AAMP                     2175    1920
AANAT                       0       0
AAR2                      695    1066
AARD                        3     298
AARS                     5132    6165
AARS2                     355     279
AARSD1                    570     512
AASDH                     692    1034
AASDHPPT                 1652    2215
AASS                      269     231
AATF                     4076    3884
AATK                        0       4
AATK-AS1                    0       0
ABAT                        7     381
ABCA1                     204     452
ABCA10                    240       5
ABCA11P                   113     264
ABCA12                     84      66
ABCA13                     17      20
ABCA17P                     0       5
ABCA2                     554     410
ABCA3                     116    1319
ABCA4                      12    4652
ABCA5                     219     219
ABCA6                       4       0
ABCA7                     231      53
ABCA8                       0       0
ABCA9                       3       0
ABCB1                       0      11
ABCB10                    937     891
ABCB11                      1       0
ABCB4                       3      21
ABCB5                       0       0
ABCB6                      69      32
ABCB7                     811    1159
ABCB8                      73     882
ABCB9                      42      70
ABCC1                    2692    4727
ABCC10                     62     145
ABCC11                      1       3
ABCC12                      0       0
ABCC13                      0       1
ABCC2                      12      73
ABCC3                    2115    1089
ABCC4                    1223     263
ABCC5                     876    1375
ABCC5-AS1                   0       0
ABCC6                      11      53
ABCC6P1                     2      14
ABCC6P2                    17      28
ABCC8                       0       0
ABCC9                       0      49
ABCD1                      92     225
ABCD2                       0       1
ABCD3                    2098    3728
ABCD4                     239     415
ABCE1                    5612    4200
ABCF1                    3112    1545
ABCF2                    1331    5274
ABCF3                    1079    1323
ABCG1                       7      57
ABCG2                     297      39
ABCG4                       0      31
ABCG5                       0       0
ABCG8                       0       1
ABHD1                       7       2
ABHD10                    940    1313
 [ reached getOption("max.print") -- omitted 29654 rows ]

DESeq2

Create DESeqDataSet object dds

dds.subset <- DESeqDataSetFromMatrix(countData = countmatrix2.subset,
                                      colData = metadata.subset,
                                      design = ~ PlatinumSensitivity)
converting counts to integer mode

Don’t need to filter - this is done automatically

Run DESeq2

path = paste0(deseq_Rdata_folder,
              "/dds_HGSC_subset.RData")

# Run DESeq
# dds.subset <- DESeq(dds.subset)
# save(dds.subset, file = path)

# Alternatively, if this has already been run, can load dds object
load(path)

Filter DESeq2 results for significant genes

Filter res.subset for padj < 0.05 and |log2FC| >= 1.2

res.subset.filtered <- as.data.frame(res.subset) %>%
  filter(padj<0.05)%>%
  filter(log2FoldChange >= 1.2 | log2FoldChange <= -1.2)
res.subset.filtered

Data QC

Data transformation

Here we performed normal transformation [log2(n+1)], variance stabilized transformation, and regularized log tranformation to improve visualization of the data values. To speed up subsequent re-runs, we have hidden analysis for non-vst.

# ntd <- normTransform(dds.subset)
# meanSdPlot(assay(ntd))
vsd.subset <- vst(dds.subset)
meanSdPlot(assay(vsd.subset))

vsd.subset.df = as.data.frame(assay(vsd.subset))
# rld <- rlog(dds.subset)
# meanSdPlot(assay(rld))

Based on this data, variance-stabilized transformation lead to the lowest standard deviation between samples and was mostly located towards the high expression transcripts, as might be expected.

PCA plot

pcaData.subset <- plotPCA(vsd.subset, intgroup=c("Subtype", "PlatinumSensitivity","CellLine"), returnData=TRUE)
using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData.subset, "percentVar"))
myColors <- c("#76AB7E", "#63E678", "#1D32FB", "#7B87FD", "#01BD1F", "#E8A426", "#0B7C1D", "#BB19E7")
names(myColors) <- levels(vsd.subset$Subtype)
colScale <- scale_colour_manual(name = "Subtype",values = myColors)
pca.subset <- ggplot(pcaData.subset, aes(PC1, PC2, color=Subtype, shape=PlatinumSensitivity, label=CellLine)) +
  geom_point(size=3) +
  geom_text(hjust=0, vjust=0) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed() +
  theme_classic() + 
  colScale
pca.subset
path = paste0(deseq_output_folder,
              "/pca_HGSC_subset.pdf")
ggsave(path, pca.subset)
Saving 7.29 x 4.51 in image

PCA plot groups samples roughly by subtype

Plot heatmaps to check sample to sample variability

Plotting the top 100 most highly expressed genes:

select <- order(rowMeans(counts(dds.subset,normalized=TRUE)),
                decreasing=TRUE)[1:100]
df <- as.data.frame(colData(dds.subset)[,c("Subtype", "PlatinumSensitivity")])
pheatmap(assay(vsd.subset)[select,], cluster_rows=FALSE, show_rownames=FALSE,
         labels_col=colData(dds.subset)[,c("CellLine")],annotation_col=df)

Dendrogram based on gene expression

Pull genes contributing to principal components 1 & 2

# Not sure what this was for:
# PCA <- prcomp(TPM.log, scale=TRUE)
# PCA.mat <- as.data.frame(PCA$x)
# PCA.PC1filt <- PCA.mat %>% filter(PC1 < quantile(PCA.mat[,"PC1"], .2)[[1]])
TPM3 <- TPM2.subset[rowSums(TPM2.subset)>1000,]
counts.sc <- t(TPM3)
dist <- dist(counts.sc)
clust <- hclust(dist, method="average")
dend1.subset <- as.dendrogram(clust)
par(mar=c(10,2,1,1))
my_colors <- ifelse(metadata.subset$PlatinumSensitivity=="sensitive", "red", 
                    ifelse(metadata.subset$PlatinumSensitivity=="resistant", "blue", 
                           ifelse(metadata.subset$PlatinumSensitivity=="intermediate", "yellow", "white" )))
plot(dend1.subset)
colored_bars(colors = my_colors, dend = dend1.subset, rowLabels = "PlatinumSensitivity")

I tried a lot of different iterations of this (including: various cutoffs for variance of genes, genes contributing most to first and second principal components, more highly expressed genes, TPM vs vst data, clustering methods), but the fundamental problem is that isogenic pairs rarely cluster anywhere near each other, even though the PCA analysis shows this relationship. I don’t know that this is a reliable method for determining relatedness/subtyping, unless a robust gene set is developed.(This is from Jessi)

Data visualization

Plot Heatmap of top differentially expressed genes

vsd.subset.df <-as.data.frame(assay(vsd.subset))
pheatmap(vsd.subset.df[rownames(res.subset.filtered),], labels_col=colData(dds.subset)[,c("CellLine")],annotation_col=df, color=colorRampPalette(c("white", "red"))(50))

Heatmap based on TPM

pheatmap(TPM.log.subset[rownames(res.subset.filtered),], labels_col=colData(dds.subset)[,c("CellLine")], annotation_col=df, color=colorRampPalette(c("white", "red"))(10))

Heatmap with mean-centered data

center_scale <- function(x) {
  scale(x, scale=FALSE)
}
vsd.subset.meancenter <- apply(vsd.subset.df, 1, center_scale)
vsd.subset.meancenter <-t(vsd.subset.meancenter)
colnames(vsd.subset.meancenter) <- colnames(vsd.subset.df)
vsd.subset.meancenter <- as.data.frame(vsd.subset.meancenter) 
color <- colorRampPalette(brewer.pal(11, "PuOr"))(50)
extreme_val = max(
  abs(min(vsd.subset.meancenter[rownames(res.subset.filtered),])),
  abs(max(vsd.subset.meancenter[rownames(res.subset.filtered),]))
)
myBreaks <- c(seq(-1 * extreme_val, 0, length.out=ceiling(50/2) + 1), 
              seq(extreme_val/50, extreme_val, length.out=floor(50/2)))

plotHeatmap = pheatmap(vsd.subset.meancenter[rownames(res.subset.filtered),], labels_col=colData(dds.subset)[,c("CellLine")], color=color, border_color = NA, annotation_col = df, breaks=myBreaks)

print(plotHeatmap)

ggsave(filename = paste0(deseq_plots_folder, "/differential_hgsc_subset_heatmap.svg"), plot = plotHeatmap, device = "svg")
Saving 8 x 10 in image

Volcano Plots

# These genes chosen by Jessi for the purpose of figures in paper
selected_labels = c("SULT1A3", "CCL5", "FN1", "CD70", "IL1RN", "ZEB2", "A2M-AS1", "TAP2", "CEBPA", "BARX1", "COL2A1", "CT45A5")
labels = ifelse(rownames(res.subset) %in% selected_labels,
               rownames(res.subset),
               "")
# test_points = res.subset[rownames(res.subset) %in% selected_labels,]
# print(as.data.frame(test_points))
# labels = rownames(test_points)

plotVolcano = res.subset %>% 
  ggplot(aes(x=log2FoldChange, y=-log10(padj), label=labels)) +
  geom_point(alpha=0.5) +
  theme_minimal() +
  scale_color_manual(values = c("black", "blue", "red")) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") +
  geom_text_repel(segment.size = .2, size = 2, max.overlaps = 1000, force = 10, force_pull = 1, nudge_y = 1)
  # geom_text_repel(segment.size = .2, size = 2, max.overlaps = 10, force = 1, force_pull = 60, nudge_y = 1)
  

print(plotVolcano)
Warning: Removed 12183 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12183 rows containing missing values or values outside the scale range
(`geom_text_repel()`).

ggsave(filename = paste0(deseq_plots_folder, "/differential_hgsc_subset_volcano.svg"), plot = plotVolcano, device = "svg")
Saving 7 x 7 in image
Warning: Removed 12183 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12183 rows containing missing values or values outside the scale range
(`geom_text_repel()`).

Gene ontology

GSEA of genes differentially expressed in resistant and sensitive lines

Perform gene set enrichment analysis using Cluster Profiler. This gives us GO pathways that are significantly regulated based on the log2fold change of expression of individual genes.

Using a padj Cutoff of 0.15.

gene_list.subset <- res.subset$log2FoldChange
names(gene_list.subset) <- rownames(res.subset)
gene_list.subset <- sort(gene_list.subset, decreasing = TRUE)

# Set the seed so our results are reproducible:
set.seed(2024)
gsea_res.subset <- gseGO(gene_list.subset, ont = "BP", OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", seed = TRUE, pvalueCutoff = 0.15)
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).

preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam,  :
  There are ties in the preranked stats (9.17% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
leading edge analysis...
done...
# Format output
gsea_res_df.subset <- as.data.frame(gsea_res.subset)
gsea_res_df.subset <- gsea_res_df.subset %>%
  mutate(original_row_num = row_number())
gsea_res_df.subset <- gsea_res_df.subset[order(gsea_res_df.subset$NES, decreasing = TRUE),]
row.names(gsea_res_df.subset) <- gsea_res_df.subset$ID

NES is the normalized enrichment score.

gsea_res_df_short.subset <- gsea_res_df.subset[c("pvalue", "p.adjust", "NES", "Description")]
gsea_res_df_short.subset$"core_enrichment_genes" <- gsea_res_df.subset$core_enrichment

Volcano Plot of gene ontology (Average NES & adjusted p value)

as.data.frame(gsea_res_df_short.subset) %>%
  ggplot(aes(x = NES, y = -log10(p.adjust), label = rownames(gsea_res_df_short.subset))) +
  geom_point() +
  scale_alpha_manual(0.5) +
  theme_minimal() +
  geom_text_repel() +
  geom_hline(yintercept = 1.301) +
  geom_vline(xintercept = 1.2) +
  geom_vline(xintercept = -1.2) +
  xlim(-10, 10)
Warning: ggrepel: 59 unlabeled data points (too many overlaps). Consider increasing max.overlaps

Upregulated pathways

gsea_res_df_short.up.subset <- subset(gsea_res_df_short.subset, gsea_res_df_short.subset$NES >= 0)
gsea_res_df_short.up.subset = gsea_res_df_short.up.subset[order(gsea_res_df_short.up.subset$p.adjust), ]
path = paste0(deseq_output_folder,
              "/subset_cisplatin_resistant_significantly_upregulated_pathways.csv")
write.csv(gsea_res_df_short.up.subset, file = path)
gsea_res_df_short.up.subset

Use Revigo to cluster upregulated pathways with pvalue < 0.15

revigo_input.cellline.up.subset <- gsea_res_df_short.up.subset[gsea_res_df_short.up.subset$p.adjust < 0.15,][c("p.adjust")]
rownames(revigo_input.cellline.up.subset) <- rownames(gsea_res_df_short.up.subset[gsea_res_df_short.up.subset$p.adjust < 0.15,])

simMatrix <- calculateSimMatrix(rownames(revigo_input.cellline.up.subset),
  orgdb = "org.Hs.eg.db",
  ont = "BP",
  method = "Rel"
)
Warning in GOSemSim::godata(orgdb, ont = ont, keytype = keytype) :
  use 'annoDb' instead of 'OrgDb'
preparing gene to GO mapping data...
preparing IC data...
scores <- setNames(-log10(revigo_input.cellline.up.subset$p.adjust), rownames(revigo_input.cellline.up.subset))

if (nrow(revigo_input.cellline.up.subset) > 1) {
  reducedTerms <- reduceSimMatrix(simMatrix,
    scores,
    threshold = 0.7,
    orgdb = "org.Hs.eg.db"
  )
} else {
  reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
  print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
}
'select()' returned 1:many mapping between keys and columns

Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).

if (nrow(reducedTerms) > 2) {
  treemapPlot(reducedTerms)
}

Use Revigo to cluster upregulated pathways with pvalue < 0.05

revigo_input.cellline.up.subset <- gsea_res_df_short.up.subset[gsea_res_df_short.up.subset$p.adjust < 0.05,][c("p.adjust")]
rownames(revigo_input.cellline.up.subset) <- rownames(gsea_res_df_short.up.subset[gsea_res_df_short.up.subset$p.adjust < 0.05,])

simMatrix <- calculateSimMatrix(rownames(revigo_input.cellline.up.subset),
  orgdb = "org.Hs.eg.db",
  ont = "BP",
  method = "Rel"
)
Warning in GOSemSim::godata(orgdb, ont = ont, keytype = keytype) :
  use 'annoDb' instead of 'OrgDb'
preparing gene to GO mapping data...
preparing IC data...
scores <- setNames(-log10(revigo_input.cellline.up.subset$p.adjust), rownames(revigo_input.cellline.up.subset))

if (nrow(revigo_input.cellline.up.subset) > 1) {
  reducedTerms <- reduceSimMatrix(simMatrix,
    scores,
    threshold = 0.7,
    orgdb = "org.Hs.eg.db"
  )
} else {
  reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
  print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
}
'select()' returned 1:many mapping between keys and columns

Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).

if (nrow(reducedTerms) > 2) {
  treemapPlot(reducedTerms)
}

Downregulated pathways

gsea_res_df_short.down.subset <- subset(gsea_res_df_short.subset, gsea_res_df_short.subset$NES <= 0)
path = paste0(deseq_output_folder,
              "/subset_cisplatin_resistant_significantly_downregulated_pathways.csv")
write.csv(gsea_res_df_short.down.subset, file = path)
gsea_res_df_short.down.subset

Use Revigo to cluster downregulated pathways with pvalue < 0.15

revigo_input.cellline.down.subset <- gsea_res_df_short.down.subset[gsea_res_df_short.down.subset$p.adjust < 0.15,][c("p.adjust")]
rownames(revigo_input.cellline.down.subset) <- rownames(gsea_res_df_short.down.subset[gsea_res_df_short.down.subset$p.adjust < 0.15,])

simMatrix <- calculateSimMatrix(rownames(revigo_input.cellline.down.subset),
  orgdb = "org.Hs.eg.db",
  ont = "BP",
  method = "Rel"
)
Warning in GOSemSim::godata(orgdb, ont = ont, keytype = keytype) :
  use 'annoDb' instead of 'OrgDb'
preparing gene to GO mapping data...
preparing IC data...
scores <- setNames(-log10(revigo_input.cellline.down.subset$p.adjust), rownames(revigo_input.cellline.down.subset))

if (nrow(revigo_input.cellline.down.subset) > 1) {
  reducedTerms <- reduceSimMatrix(simMatrix,
    scores,
    threshold = 0.7,
    orgdb = "org.Hs.eg.db"
  )
} else {
  reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
  print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
}
'select()' returned 1:many mapping between keys and columns

Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).

if (nrow(reducedTerms) > 2) {
  treemapPlot(reducedTerms)
}

---
title: "DESeq: Ovarian Cancer Cell Line RNAseq Platinum Sensitivity"
output: html_notebook
---

## Introduction  

RNA-seq was run on 36 ovarian cancer cell lines, each in singlicate.  

All 36 cell lines have 72h cisplatin IC50s determined by Kristin Adams and Kendra 
Wendt in the Lang Lab. 

<!-- ![Cisplatin IC50s](CisplatinIC50.jpg){height=50%, width=50%} -->

Add in links to Lab notebooks for IC50, RNAseq sample prep

## Inputs

```{r inputs}
library(knitr)
knit("src/load_inputs.Rmd")
deseq_plots_folder = "data/deseq/output"
```

Subset Metadata to exclude PEO4, PEO6, and PEA1

```{r}
metadata.subset = metadata[!(metadata$CellLine %in% c("PEO4", "PEO6", "PEA1")),]
countmatrix2.subset = countmatrix2[, colnames(countmatrix2) %in% metadata.subset$files]
TPM2.subset = TPM2[, colnames(TPM2) %in% metadata.subset$CellLine]
TPM.log.subset = TPM.log[, colnames(TPM.log) %in% metadata.subset$files]

print(countmatrix2.subset)
```

## DESeq2

### Create DESeqDataSet object dds

```{r DESeqDataSet generation}
dds.subset <- DESeqDataSetFromMatrix(countData = countmatrix2.subset,
                                      colData = metadata.subset,
                                      design = ~ PlatinumSensitivity)
```
<!-- ### Pre-filtering --> Don't need to filter - this is done automatically
<!-- This step removes genes with low expression to increase multiple comparison power. -->

<!-- ```{r} -->
<!-- keep <- rowSums(counts(dds)) >= 500 -->
<!-- dds <- dds[keep,] -->
<!-- nrow(dds) -->
<!-- ``` -->


### Run DESeq2 
```{r DESeq2}
path = paste0(deseq_Rdata_folder,
              "/dds_HGSC_subset.RData")

# Run DESeq
dds.subset <- DESeq(dds.subset)
save(dds.subset, file = path)

# Alternatively, if this has already been run, can load dds object
# load(path)
```

### Print DESeq results
```{r create results table}
res.subset <- results(dds.subset, contrast=c("PlatinumSensitivity", "resistant", "sensitive"))
res.subset

path = paste0(deseq_output_folder,
              "/DESeq_HGSC_subset.csv")
write.csv(as.data.frame(res.subset), file = path)
```

### Filter DESeq2 results for significant genes
Filter res.subset for padj < 0.05 and |log2FC| >= 1.2
```{r filter res.subset}
res.subset.filtered <- as.data.frame(res.subset) %>%
  filter(padj<0.05)%>%
  filter(log2FoldChange >= 1.2 | log2FoldChange <= -1.2)
res.subset.filtered
```
## Data QC

### Data transformation 
Here we performed normal transformation [log2(n+1)], variance stabilized transformation, and regularized log tranformation to improve visualization of the data values. To speed up subsequent re-runs, we have hidden analysis for non-vst.

```{r norm transformation and stdev plot}
# ntd <- normTransform(dds.subset)
# meanSdPlot(assay(ntd))
```


```{r var stab transformation and stdev plot}
vsd.subset <- vst(dds.subset)
meanSdPlot(assay(vsd.subset))
vsd.subset.df = as.data.frame(assay(vsd.subset))
```


```{r reg log transformation and stdev plot}
# rld <- rlog(dds.subset)
# meanSdPlot(assay(rld))
```

Based on this data, variance-stabilized transformation lead to the lowest standard deviation between samples and was mostly located towards the high expression transcripts, as might be expected.

### PCA plot
```{r vsd PCA}
pcaData.subset <- plotPCA(vsd.subset, intgroup=c("Subtype", "PlatinumSensitivity","CellLine"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData.subset, "percentVar"))
myColors <- c("#76AB7E", "#63E678", "#1D32FB", "#7B87FD", "#01BD1F", "#E8A426", "#0B7C1D", "#BB19E7")
names(myColors) <- levels(vsd.subset$Subtype)
colScale <- scale_colour_manual(name = "Subtype",values = myColors)
pca.subset <- ggplot(pcaData.subset, aes(PC1, PC2, color=Subtype, shape=PlatinumSensitivity, label=CellLine)) +
  geom_point(size=3) +
  geom_text(hjust=0, vjust=0) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed() +
  theme_classic() + 
  colScale
pca.subset
path = paste0(deseq_output_folder,
              "/pca_HGSC_subset.pdf")
ggsave(path, pca.subset)
```
PCA plot groups samples roughly by subtype

<!-- ### Sample-wise correlation -->

<!-- ```{r, fig.height=4, fig.width= 8} -->
<!-- vsd.subset.mod = vsd.subset.df -->
<!-- colnames(vsd.subset.mod) <- metadata.subset$CellLine[match(colnames(vsd.subset.mod), metadata.subset$files)] -->
<!-- vsd.subset.mod$gene <- row.names(vsd.subset.mod) -->
<!-- vsd.subset.mod <- vsd.subset.mod[!is.na(names(vsd.subset.mod))] -->


<!-- corr <- vsd.subset.mod %>% -->
<!--   select(-gene) %>% -->
<!--   cor(method = "spearman") -->

<!-- annotation = metadata.subset %>% -->
<!--   filter(Subtype == "HGSC") -->

<!-- row.names(annotation) = metadata.subset$CellLine[match(row.names(annotation), metadata.subset$files)] -->
<!-- annotation = annotation[, c("IC50", "PlatinumSensitivity")] -->

<!-- pheatmap(corr, annotation_col=annotation) -->
<!-- ``` -->


### Plot heatmaps to check sample to sample variability
Plotting the top 100 most highly expressed genes:
```{r heatmaps}
select <- order(rowMeans(counts(dds.subset,normalized=TRUE)),
                decreasing=TRUE)[1:100]
df <- as.data.frame(colData(dds.subset)[,c("Subtype", "PlatinumSensitivity")])
pheatmap(assay(vsd.subset)[select,], cluster_rows=FALSE, show_rownames=FALSE,
         labels_col=colData(dds.subset)[,c("CellLine")],annotation_col=df)
```

### Dendrogram based on gene expression
Pull genes contributing to principal components 1 & 2
```{r}
# Not sure what this was for:
# PCA <- prcomp(TPM.log, scale=TRUE)
# PCA.mat <- as.data.frame(PCA$x)
# PCA.PC1filt <- PCA.mat %>% filter(PC1 < quantile(PCA.mat[,"PC1"], .2)[[1]])
```


```{r}
TPM3 <- TPM2.subset[rowSums(TPM2.subset)>1000,]
counts.sc <- t(TPM3)
dist <- dist(counts.sc)
clust <- hclust(dist, method="average")
dend1.subset <- as.dendrogram(clust)
```

```{r, fig.width=10}
par(mar=c(10,2,1,1))
my_colors <- ifelse(metadata.subset$PlatinumSensitivity=="sensitive", "red", 
                    ifelse(metadata.subset$PlatinumSensitivity=="resistant", "blue", 
                           ifelse(metadata.subset$PlatinumSensitivity=="intermediate", "yellow", "white" )))
plot(dend1.subset)
colored_bars(colors = my_colors, dend = dend1.subset, rowLabels = "PlatinumSensitivity")
```

I tried a lot of different iterations of this (including: various cutoffs for variance of genes, genes contributing most to first and second principal components, more highly expressed genes, TPM vs vst data, clustering methods), but the fundamental problem is that isogenic pairs rarely cluster anywhere near each other, even though the PCA analysis shows this relationship. I don't know that this is a reliable method for determining relatedness/subtyping, unless a robust gene set is developed.(This is from Jessi)

## Data visualization

### Plot Heatmap of top differentially expressed genes
```{r plot heatmap, fig.height = 8, fig.width = 8}
vsd.subset.df <-as.data.frame(assay(vsd.subset))
pheatmap(vsd.subset.df[rownames(res.subset.filtered),], labels_col=colData(dds.subset)[,c("CellLine")],annotation_col=df, color=colorRampPalette(c("white", "red"))(50))
```

### Heatmap based on TPM
```{r, fig.height = 8, fig.width = 8}
pheatmap(TPM.log.subset[rownames(res.subset.filtered),], labels_col=colData(dds.subset)[,c("CellLine")], annotation_col=df, color=colorRampPalette(c("white", "red"))(10))
```

### Heatmap with mean-centered data
```{r, fig.height=10, fig.width=8}
center_scale <- function(x) {
  scale(x, scale=FALSE)
}
vsd.subset.meancenter <- apply(vsd.subset.df, 1, center_scale)
vsd.subset.meancenter <-t(vsd.subset.meancenter)
colnames(vsd.subset.meancenter) <- colnames(vsd.subset.df)
vsd.subset.meancenter <- as.data.frame(vsd.subset.meancenter) 
color <- colorRampPalette(brewer.pal(11, "PuOr"))(50)
extreme_val = max(
  abs(min(vsd.subset.meancenter[rownames(res.subset.filtered),])),
  abs(max(vsd.subset.meancenter[rownames(res.subset.filtered),]))
)
myBreaks <- c(seq(-1 * extreme_val, 0, length.out=ceiling(50/2) + 1), 
              seq(extreme_val/50, extreme_val, length.out=floor(50/2)))

plotHeatmap = pheatmap(vsd.subset.meancenter[rownames(res.subset.filtered),], labels_col=colData(dds.subset)[,c("CellLine")], color=color, border_color = NA, annotation_col = df, breaks=myBreaks)

print(plotHeatmap)
ggsave(filename = paste0(deseq_plots_folder, "/differential_hgsc_subset_heatmap.svg"), plot = plotHeatmap, device = "svg")
```

### Volcano Plots
```{r, fig.height=7, fig.width=7}
# These genes chosen by Jessi for the purpose of figures in paper
selected_labels = c("SULT1A3", "CCL5", "FN1", "CD70", "IL1RN", "ZEB2", "A2M-AS1", "TAP2", "CEBPA", "BARX1", "COL2A1", "CT45A5")
labels = ifelse(rownames(res.subset) %in% selected_labels,
               rownames(res.subset),
               "")
# test_points = res.subset[rownames(res.subset) %in% selected_labels,]
# print(as.data.frame(test_points))
# labels = rownames(test_points)

plotVolcano = res.subset %>% 
  ggplot(aes(x=log2FoldChange, y=-log10(padj), label=labels)) +
  geom_point(alpha=0.5) +
  theme_minimal() +
  scale_color_manual(values = c("black", "blue", "red")) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") +
  geom_text_repel(segment.size = .2, size = 2, max.overlaps = 1000, force = 10, force_pull = 1, nudge_y = 1)
  # geom_text_repel(segment.size = .2, size = 2, max.overlaps = 10, force = 1, force_pull = 60, nudge_y = 1)
  

print(plotVolcano)
ggsave(filename = paste0(deseq_plots_folder, "/differential_hgsc_subset_volcano.svg"), plot = plotVolcano, device = "svg")
```

## Gene ontology

### GSEA of genes differentially expressed in resistant and sensitive lines

Perform gene set enrichment analysis using Cluster Profiler. This gives us GO pathways that are significantly regulated based on the log2fold change of expression of individual genes. 

Using a padj Cutoff of 0.15.

```{r}
gene_list.subset <- res.subset$log2FoldChange
names(gene_list.subset) <- rownames(res.subset)
gene_list.subset <- sort(gene_list.subset, decreasing = TRUE)

# Set the seed so our results are reproducible:
set.seed(2024)
gsea_res.subset <- gseGO(gene_list.subset, ont = "BP", OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", seed = TRUE, pvalueCutoff = 0.15)

# Format output
gsea_res_df.subset <- as.data.frame(gsea_res.subset)
gsea_res_df.subset <- gsea_res_df.subset %>%
  mutate(original_row_num = row_number())
gsea_res_df.subset <- gsea_res_df.subset[order(gsea_res_df.subset$NES, decreasing = TRUE),]
row.names(gsea_res_df.subset) <- gsea_res_df.subset$ID
```

NES is the normalized enrichment score.
```{r}
gsea_res_df_short.subset <- gsea_res_df.subset[c("pvalue", "p.adjust", "NES", "Description")]
gsea_res_df_short.subset$"core_enrichment_genes" <- gsea_res_df.subset$core_enrichment
```

Volcano Plot of gene ontology (Average NES & adjusted p value)

```{r Volcano gene ontology}
as.data.frame(gsea_res_df_short.subset) %>%
  ggplot(aes(x = NES, y = -log10(p.adjust), label = rownames(gsea_res_df_short.subset))) +
  geom_point() +
  scale_alpha_manual(0.5) +
  theme_minimal() +
  geom_text_repel() +
  geom_hline(yintercept = 1.301) +
  geom_vline(xintercept = 1.2) +
  geom_vline(xintercept = -1.2) +
  xlim(-10, 10)
```

#### Upregulated pathways

```{r}
gsea_res_df_short.up.subset <- subset(gsea_res_df_short.subset, gsea_res_df_short.subset$NES >= 0)
gsea_res_df_short.up.subset = gsea_res_df_short.up.subset[order(gsea_res_df_short.up.subset$p.adjust), ]
path = paste0(deseq_output_folder,
              "/subset_cisplatin_resistant_significantly_upregulated_pathways.csv")
write.csv(gsea_res_df_short.up.subset, file = path)
gsea_res_df_short.up.subset
```

Use Revigo to cluster upregulated pathways with pvalue < 0.15

```{r upreg cluster}
revigo_input.cellline.up.subset <- gsea_res_df_short.up.subset[gsea_res_df_short.up.subset$p.adjust < 0.15,][c("p.adjust")]
rownames(revigo_input.cellline.up.subset) <- rownames(gsea_res_df_short.up.subset[gsea_res_df_short.up.subset$p.adjust < 0.15,])

simMatrix <- calculateSimMatrix(rownames(revigo_input.cellline.up.subset),
  orgdb = "org.Hs.eg.db",
  ont = "BP",
  method = "Rel"
)
scores <- setNames(-log10(revigo_input.cellline.up.subset$p.adjust), rownames(revigo_input.cellline.up.subset))

if (nrow(revigo_input.cellline.up.subset) > 1) {
  reducedTerms <- reduceSimMatrix(simMatrix,
    scores,
    threshold = 0.7,
    orgdb = "org.Hs.eg.db"
  )
} else {
  reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
  print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
}

```

Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).

```{r upreg cluster treemap}
if (nrow(reducedTerms) > 2) {
  treemapPlot(reducedTerms)
}
```

Use Revigo to cluster upregulated pathways with pvalue < 0.05

```{r upreg cluster lower p}
revigo_input.cellline.up.subset <- gsea_res_df_short.up.subset[gsea_res_df_short.up.subset$p.adjust < 0.05,][c("p.adjust")]
rownames(revigo_input.cellline.up.subset) <- rownames(gsea_res_df_short.up.subset[gsea_res_df_short.up.subset$p.adjust < 0.05,])

simMatrix <- calculateSimMatrix(rownames(revigo_input.cellline.up.subset),
  orgdb = "org.Hs.eg.db",
  ont = "BP",
  method = "Rel"
)
scores <- setNames(-log10(revigo_input.cellline.up.subset$p.adjust), rownames(revigo_input.cellline.up.subset))

if (nrow(revigo_input.cellline.up.subset) > 1) {
  reducedTerms <- reduceSimMatrix(simMatrix,
    scores,
    threshold = 0.7,
    orgdb = "org.Hs.eg.db"
  )
} else {
  reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
  print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
}

```

Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).

```{r upreg cluster treemap low p}
if (nrow(reducedTerms) > 2) {
  treemapPlot(reducedTerms)
}
```

#### Downregulated pathways

```{r}
gsea_res_df_short.down.subset <- subset(gsea_res_df_short.subset, gsea_res_df_short.subset$NES <= 0)
path = paste0(deseq_output_folder,
              "/subset_cisplatin_resistant_significantly_downregulated_pathways.csv")
write.csv(gsea_res_df_short.down.subset, file = path)
gsea_res_df_short.down.subset
```

Use Revigo to cluster downregulated pathways with pvalue < 0.15

```{r downreg cluster}
revigo_input.cellline.down.subset <- gsea_res_df_short.down.subset[gsea_res_df_short.down.subset$p.adjust < 0.15,][c("p.adjust")]
rownames(revigo_input.cellline.down.subset) <- rownames(gsea_res_df_short.down.subset[gsea_res_df_short.down.subset$p.adjust < 0.15,])

simMatrix <- calculateSimMatrix(rownames(revigo_input.cellline.down.subset),
  orgdb = "org.Hs.eg.db",
  ont = "BP",
  method = "Rel"
)
scores <- setNames(-log10(revigo_input.cellline.down.subset$p.adjust), rownames(revigo_input.cellline.down.subset))

if (nrow(revigo_input.cellline.down.subset) > 1) {
  reducedTerms <- reduceSimMatrix(simMatrix,
    scores,
    threshold = 0.7,
    orgdb = "org.Hs.eg.db"
  )
} else {
  reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
  print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
}

```

Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).

```{r downreg cluster treemap}
if (nrow(reducedTerms) > 2) {
  treemapPlot(reducedTerms)
}
```
  